Requirements:
NLTK
Specifications:
# Importing Libraries
import time
import nltk
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
# English stopword set used later by cleanData(remove_stops=True).
stops = set(stopwords.words("english"))
# Time how long the Gitter room export takes.
start = time.time()
print("Exporting Oppia Gitter Chat")
# IPython "!" shell magic: export the Oppia Gitter room to a JSON file.
# SECURITY NOTE(review): the API token is hard-coded in plain text here.
# It should be revoked and loaded from an environment variable instead.
!gitter-export-room id 561f4f4416b6c7089cb70f23 --token a5d7930a73eeaaed398112b1c18735db2da5a8cc > ../data/oppia_chat_2018_2_19.json
end = time.time()
print("Elapsed Time: ",end - start)
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    """Normalize a chat message for text analysis.

    Strips every character that is not alphanumeric or whitespace,
    replaces newlines with spaces, then optionally lowercases tokens,
    removes English stopwords (module-level ``stops``), and applies
    Porter stemming. Returns the cleaned string.
    """
    cleaned = re.sub(r'[^A-Za-z0-9\s]', r'', text)
    cleaned = re.sub(r'\n', r' ', cleaned)
    if lowercase:
        cleaned = " ".join(token.lower() for token in cleaned.split())
    if remove_stops:
        cleaned = " ".join(token for token in cleaned.split() if token not in stops)
    if stemming:
        stemmer = PorterStemmer()
        cleaned = " ".join(stemmer.stem(token) for token in cleaned.split())
    return cleaned
def Extract_Username(column):
    """Pull the 'username' field out of a Gitter fromUser mapping."""
    return dict(column)['username']
def Extract_Issue_Number(column):
    """Return the issue number of the first issue referenced in a message.

    Parameters: column -- the Gitter 'issues' field, expected to be a
    list of dicts each carrying a 'number' key.

    Returns the first issue's 'number', or np.nan when no issue is
    referenced. The original `column != []` test let non-list values
    (e.g. NaN/None produced by pd.read_json for missing fields) fall
    through and crash on indexing; the isinstance guard handles them.
    """
    if isinstance(column, list) and column:
        return dict(column[0])['number']
    return np.nan
def Extract_URLs(column):
    """Return the URL of the first link present in a message.

    Parameters: column -- the Gitter 'urls' field, expected to be a
    list of dicts each carrying a 'url' key.

    Returns the first entry's 'url', or np.nan when the message has no
    links. As with Extract_Issue_Number, the isinstance guard replaces
    the fragile `column != []` check so NaN/None inputs return NaN
    instead of raising.
    """
    if isinstance(column, list) and column:
        return dict(column[0])[u'url']
    return np.nan
def Mentions(column):
    """Return a comma-separated string of @mention handles in the text.

    Handles are matched case-insensitively as runs of letters, digits
    and underscores following an '@'; an empty string means no mentions.
    """
    handles = re.findall("@([a-z0-9_]+)", column, re.I)
    return ', '.join(handles)
# Load the exported chat JSON; nested objects (fromUser, issues, urls)
# arrive as raw dicts/lists and are flattened further below.
chat = pd.read_json('../data/oppia_chat_2018_2_19.json')
chat.head()
The exported chat data is in JSON format. Unfortunately, pandas didn't parse the nested dictionaries. The Gitter API defines the feature columns as shown below:
id: ID of the message.
text: Original message in plain-text/markdown.
html: HTML formatted message.
sent: ISO formatted date of the message.
editedAt: ISO formatted date of the message if edited.
fromUser: [User](user-resource) that sent the message.
unread: Boolean that indicates if the current user has read the message.
readBy: Number of users that have read the message.
urls: List of URLs present in the message.
mentions: List of @Mentions in the message.
issues: List of #Issues referenced in the message.
meta: Metadata. This is currently not used for anything.
v: Version.
gv: Stands for "Gravatar version" and is used for cache busting.
Let's clean the data and extract additional features for data analysis:
# Reverse column order (purely cosmetic for notebook display).
chat = chat[chat.columns[::-1]]
# Keep a copy of the raw message text before it is cleaned in place.
text = chat['text'].copy()
# Flatten the nested JSON fields into scalar columns.
chat['username'] = chat['fromUser'].apply(Extract_Username)
chat['issue number'] = chat['issues'].apply(Extract_Issue_Number)
chat['url'] = chat['urls'].apply(Extract_URLs)
chat['mentions'] = chat['text'].apply(Mentions)
# Empty mention strings become NaN so counts/aggregations skip them.
chat['mentions'].replace(to_replace ='', value= np.nan, inplace = True )
# Parse the ISO date strings into proper timestamps.
chat['sent'] = pd.to_datetime(chat['sent'])
chat['editedAt'] = pd.to_datetime(chat['editedAt'])
# Rough proxy for "is this a question": count of '?' in the message.
chat['question_marks'] = chat['text'].apply(lambda comment: comment.count('?'))
# Calendar features derived from the sent timestamp.
chat['Year'] = chat['sent'].dt.year
chat['Month'] = chat['sent'].dt.month
chat['Quarter'] = chat['sent'].dt.quarter
chat['DOW'] = chat['sent'].dt.dayofweek
chat['Day'] = chat['sent'].dt.day
chat['Hour'] = chat['sent'].dt.hour
# NOTE(review): 'status' is not in the Gitter schema documented above --
# confirm the column exists in the export, or drop() will raise KeyError.
cols_to_drop = ["fromUser","html","issues","meta","status","unread","urls"]
chat.drop(cols_to_drop, axis=1, inplace=True)
# Clean the text in place: lowercase, stopword removal, Porter stemming.
chat['text'] = chat['text'].apply(cleanData, lowercase = True, remove_stops = True, stemming = True)
chat.head()
Nice! Now we have usernames, issue numbers, and other useful data.
chat_df = chat

# Average readBy count per Year x Month, shown as a table then a heatmap.
chat_df.pivot_table(values='readBy',
                    index='Year', columns='Month')
plt.figure(figsize=(20,10))
sns.heatmap(chat_df.pivot_table(values='readBy',
                                index='Year', columns='Month') , cmap='inferno', annot = True)
plt.title("ReadBy Count Heatmap Timeline", fontsize=20)

# NOTE(review): .mean() without numeric_only relies on older pandas
# silently dropping non-numeric columns -- confirm the pandas version.
chat_df.groupby(by='Hour').mean().head()

plt.title('Groupby Hour.mean()', fontsize=20)
chat_df.groupby(by='Hour').mean()['question_marks'].plot(figsize=(20,5), kind='line', color='gold')
plt.xlabel('Hour', fontsize=20)
plt.ylabel('Number of Questions', fontsize=20)

plt.title('GroupBy DOW', fontsize=20)
chat_df.groupby(by='DOW').mean()['question_marks'].plot(figsize=(20,5), kind='line', color='purple')
# Fixed: this axis was mislabeled 'Hour' although the grouping is by day of week.
plt.xlabel('DOW', fontsize=20)
plt.ylabel('Number of Questions', fontsize=20)

plt.title('GroupBy Month', fontsize=20)
chat_df.groupby(by='Month').mean()['question_marks'].plot(figsize=(20,5), kind='line', color='red')
# Fixed: this axis was mislabeled 'Hour' although the grouping is by month.
plt.xlabel('Month', fontsize=20)
plt.ylabel('Number of Questions', fontsize=20)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.spatial.distance import cdist
# TF-IDF over word 1- to 4-grams; a term must appear in >= 5 messages.
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,4), min_df=5)
tfidfdata = tfidfvec.fit_transform(chat_df['text'])
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# newer versions require get_feature_names_out().
feature_names = tfidfvec.get_feature_names()
len(feature_names)
pd.DataFrame(feature_names).transpose()
# create dataframe for features
# Densifying the sparse matrix -- memory-hungry for large vocabularies.
tfidf_df = pd.DataFrame(tfidfdata.todense())
tfidf_df.columns = feature_names
X = tfidf_df
from tqdm import tqdm
# k means determine k: elbow method over k = 1..5.
distortions = []
K = range(1,6)
for k in tqdm(K):
    # Fit once per k. The original called .fit(X) a second time on the
    # same model, redundantly doubling the runtime of the loop.
    kmeanModel = KMeans(n_clusters=k).fit(X)
    # Distortion: mean distance of each sample to its nearest centroid.
    distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
K
distortions
# Plot the elbow
plt.figure(figsize=(10,6))
plt.plot(K, distortions, 'bx-', color='gold')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
# Final clustering with k=2 (chosen from the elbow plot above).
kmeans = KMeans(n_clusters=2)
kmeans.fit(X=tfidf_df)
# Per-message frame carrying the assigned cluster label.
cluster = pd.DataFrame(chat[['text','mentions','username','question_marks']])
cluster['cluster'] = kmeans.predict(tfidf_df)
cluster['text'] = chat['text']
cluster.head()
import matplotlib.gridspec as gridspec
from wordcloud import WordCloud , STOPWORDS
from PIL import Image
import matplotlib_venn as venn
# Word cloud for messages assigned to cluster 0 (question-like texts).
stopword=set(STOPWORDS)
text=cluster[cluster['cluster'] == 0].text.values
wc= WordCloud(background_color="black", max_words=2000, stopwords=stopword, height=800, width=1500)
wc.generate(" ".join(text))
plt.figure(figsize=(20,12))
plt.axis("off")
plt.title("Words in Cluster 0 ~ Questions" , fontsize=30)
plt.imshow(wc.recolor(colormap= 'inferno' , random_state=17), alpha=1)
plt.show()
# Word cloud for messages assigned to cluster 1.
stopword=set(STOPWORDS)
text=cluster[cluster['cluster'] == 1].text.values
wc= WordCloud(background_color="black", max_words=2000, stopwords=stopword, height=800, width=1500)
wc.generate(" ".join(text))
plt.figure(figsize=(20,12))
plt.axis("off")
plt.title("Words in Cluster 1 ~ Golden Words" , fontsize=30)
plt.imshow(wc.recolor( random_state=17), alpha=1)
plt.show()
# Messages containing at least one '?', and how they split across clusters.
cluster[cluster['question_marks'] > 0].head(20)
cluster[cluster['question_marks'] > 0].cluster.value_counts()
# Converted to print() calls: the Python 2 print statements used here were
# a SyntaxError under Python 3 and inconsistent with the print() calls
# already used at the top of this file.
print("Number of Questions asked:", chat[chat['question_marks']>0].text.shape[0])
print("Total number of comments:", chat.shape[0])
print("Percentage of Questions:", (chat[ chat['question_marks']>0].text.shape[0] / float(chat.shape[0]))*100.0)
Yay! We have successfully classified the texts into 2 clusters: cluster 0 contains texts with questions, while cluster 1 covers gratitude and FYI texts.
# Top 10 most active users by message count, as a horizontal bar chart.
MostActiveUsers = pd.DataFrame(chat_df.username.value_counts()[:10]).reset_index()
plt.figure(figsize=(20,12))
sns.barplot(y=MostActiveUsers['index'], x=MostActiveUsers.username )
# Word cloud over every (cleaned) message in the room.
stopword=set(STOPWORDS)
text=chat_df.text.values
wc= WordCloud(background_color="black", max_words=2000, stopwords=stopword, height=800, width=1500)
wc.generate(" ".join(text))
plt.figure(figsize=(20,12))
plt.axis("off")
plt.title("Words frequented in Oppia Gitter Chat", fontsize=30)
plt.imshow(wc.recolor(colormap= 'viridis' , random_state=17), alpha=1)
plt.show()
# Word cloud for a single user's messages (reuses `stopword` from above).
text=chat_df[chat_df['username'] == 'bbriggs'].text.values
wc= WordCloud(background_color="black", max_words=2000, stopwords=stopword, height=800, width=1500)
wc.generate(" ".join(text))
plt.figure(figsize=(20,12))
plt.axis("off")
plt.title("Words used by BBriggs at Oppia Gitter Chat", fontsize=30)
plt.imshow(wc.recolor(colormap= 'inferno' , random_state=17), alpha=1)
plt.show()
# Same per-user word cloud pattern for user 'seanlip'.
text=chat_df[chat_df['username'] == 'seanlip'].text.values
wc= WordCloud(background_color="black", max_words=2000, stopwords=stopword, height=800, width=1500)
wc.generate(" ".join(text))
plt.figure(figsize=(20,12))
plt.axis("off")
plt.title("Words used by Sean at Oppia Gitter Chat", fontsize=30)
plt.imshow(wc.recolor(colormap= 'inferno' , random_state=17), alpha=1)
plt.show()
# Same per-user word cloud pattern for user 'shaz13'.
text=chat_df[chat_df['username'] == 'shaz13'].text.values
wc= WordCloud(background_color="black", max_words=2000, stopwords=stopword, height=800, width=1500)
wc.generate(" ".join(text))
plt.figure(figsize=(20,12))
plt.axis("off")
plt.title("Words used by Shaz at Oppia Gitter Chat", fontsize=30)
plt.imshow(wc.recolor(colormap= 'viridis' , random_state=17), alpha=1)
plt.show()
# Mean question count by day of month.
plt.title('Mean of No. Questions asked', fontsize=20)
chat_df.groupby(by='Day').mean()['question_marks'].plot(figsize=(20,10), kind='bar', width=.9, color='gold')
plt.xlabel('Day', fontsize=20)
plt.ylabel('Number of Questions', fontsize=20)
# Mean question count by month.
plt.title('Mean of No. Questions asked', fontsize=20)
chat_df.groupby(by='Month').mean()['question_marks'].plot(figsize=(20,10), kind='bar', width= .4, color='grey')
plt.xlabel('Month', fontsize=20)
plt.ylabel('Number of Questions', fontsize=20)
# Mean question count by hour of day.
plt.title('Mean of No. Questions asked', fontsize=20)
chat_df.groupby(by='Hour').mean()['question_marks'].plot(figsize=(20,10), kind='bar', width=1, color='gold')
plt.xlabel('Hour', fontsize=20)
plt.ylabel('Number of Questions', fontsize=20)
# Top 5 most-referenced GitHub issues by message count.
chat_df['issue number'].value_counts().head()